import geopandas as gpdSpatial Disparity Studio: Statistical Testing
Composition
sd = gpd.read_file('sdtracts.geojson', driver='GeoJSON')import seaborn as snssd = sd.fillna(value=0)vars = [ 'n_total_pop', 'n_nonhisp_white_persons',
'n_nonhisp_black_persons', 'n_hispanic_persons']
short = sd[vars]short.head()| n_total_pop | n_nonhisp_white_persons | n_nonhisp_black_persons | n_hispanic_persons | |
|---|---|---|---|---|
| 0 | 3093.0 | 2389.0 | 0.0 | 489.0 |
| 1 | 1891.0 | 1569.0 | 10.0 | 140.0 |
| 2 | 4542.0 | 3390.0 | 4.0 | 616.0 |
| 3 | 5239.0 | 3820.0 | 266.0 | 871.0 |
| 4 | 3801.0 | 2148.0 | 228.0 | 884.0 |
(short.n_total_pop==0).sum()2
den = short.n_total_pop
den = den + (1 * den==0)short['p_white'] = short.n_nonhisp_white_persons / den
short['p_black'] = short.n_hispanic_persons / den
short['p_hispanic'] = short.n_hispanic_persons / den/tmp/ipykernel_3461790/813278778.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
short['p_white'] = short.n_nonhisp_white_persons / den
/tmp/ipykernel_3461790/813278778.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
short['p_black'] = short.n_hispanic_persons / den
/tmp/ipykernel_3461790/813278778.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
short['p_hispanic'] = short.n_hispanic_persons / den
short.head()| n_total_pop | n_nonhisp_white_persons | n_nonhisp_black_persons | n_hispanic_persons | p_white | p_black | p_hispanic | |
|---|---|---|---|---|---|---|---|
| 0 | 3093.0 | 2389.0 | 0.0 | 489.0 | 0.772389 | 0.158099 | 0.158099 |
| 1 | 1891.0 | 1569.0 | 10.0 | 140.0 | 0.829720 | 0.074035 | 0.074035 |
| 2 | 4542.0 | 3390.0 | 4.0 | 616.0 | 0.746367 | 0.135623 | 0.135623 |
| 3 | 5239.0 | 3820.0 | 266.0 | 871.0 | 0.729147 | 0.166253 | 0.166253 |
| 4 | 3801.0 | 2148.0 | 228.0 | 884.0 | 0.565114 | 0.232570 | 0.232570 |
h = short.sort_values(by='p_hispanic')
h.head()| n_total_pop | n_nonhisp_white_persons | n_nonhisp_black_persons | n_hispanic_persons | p_white | p_black | p_hispanic | |
|---|---|---|---|---|---|---|---|
| 627 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 238 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 414 | 2957.0 | 2347.0 | 88.0 | 122.0 | 0.793710 | 0.041258 | 0.041258 |
| 460 | 3250.0 | 2809.0 | 28.0 | 145.0 | 0.864308 | 0.044615 | 0.044615 |
| 467 | 5584.0 | 4763.0 | 11.0 | 264.0 | 0.852973 | 0.047278 | 0.047278 |
h.shape(628, 7)
h['wcdf'] = h.n_nonhisp_white_persons.cumsum() / h.n_nonhisp_white_persons.sum()
h['hcdf'] = h.n_hispanic_persons.cumsum() / h.n_hispanic_persons.sum()
h['tcdf'] = h.n_total_pop.cumsum() / h.n_total_pop.sum()
h['bcdf'] = h.n_nonhisp_black_persons.cumsum() / h.n_nonhisp_black_persons.sum()
h.head()| n_total_pop | n_nonhisp_white_persons | n_nonhisp_black_persons | n_hispanic_persons | p_white | p_black | p_hispanic | wcdf | hcdf | tcdf | bcdf | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 627 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 238 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 414 | 2957.0 | 2347.0 | 88.0 | 122.0 | 0.793710 | 0.041258 | 0.041258 | 0.001554 | 0.000109 | 0.000892 | 0.000564 |
| 460 | 3250.0 | 2809.0 | 28.0 | 145.0 | 0.864308 | 0.044615 | 0.044615 | 0.003413 | 0.000239 | 0.001872 | 0.000743 |
| 467 | 5584.0 | 4763.0 | 11.0 | 264.0 | 0.852973 | 0.047278 | 0.047278 | 0.006566 | 0.000475 | 0.003556 | 0.000814 |
import seaborn as snsimport matplotlib.pyplot as plt
# plot lines
plt.plot(h.p_hispanic, h.hcdf, label = "Hispanic", linestyle='dashed', color='k')
plt.plot(h.p_hispanic, h.wcdf, label = "Non-Hispanic White", linestyle='dotted', color='k')
plt.plot(h.p_hispanic, h.bcdf, label = "Non-Hispanic Black", linestyle='dashdot', color='k')
plt.plot(h.p_hispanic, h.tcdf, label='All', color='k')
plt.xlabel('Percent Hispanic')
plt.ylabel('cdf')
plt.legend()
plt.show()
h[h.hcdf>=0.5].iloc[0]n_total_pop 31118.000000
n_nonhisp_white_persons 4191.000000
n_nonhisp_black_persons 970.000000
n_hispanic_persons 14901.000000
p_white 0.134681
p_black 0.478855
p_hispanic 0.478855
wcdf 0.908881
hcdf 0.503078
tcdf 0.749890
bcdf 0.721848
Name: 307, dtype: float64
phh50 = h[h.hcdf>=0.5].iloc[0].p_hispanicphh500.4788546821775178
phw50 = h[h.wcdf>=0.5].iloc[0].p_hispanicphw500.1966406023747466
import matplotlib.pyplot as plt
# plot lines
plt.plot(h.p_hispanic, h.hcdf, label = "Hispanic", linestyle='dashed', color='k')
plt.plot(h.p_hispanic, h.wcdf, label = "Non-Hispanic White", linestyle='dotted', color='k')
plt.plot(h.p_hispanic, h.bcdf, label = "Non-Hispanic Black", linestyle='dashdot', color='k')
plt.plot(h.p_hispanic, h.tcdf, label='All', color='k')
plt.xlabel('Percent Hispanic')
plt.ylabel('cdf')
plt.legend()
plt.plot([0, phh50], [.5, .5], linestyle='solid', color='k')
plt.plot([phh50, phh50], [0, .5], linestyle='dashed', color='k')
plt.plot([phw50, phw50], [0, .5], linestyle='dotted', color='k')
plt.show()
h = short.sort_values(by='p_white')
h.head()| n_total_pop | n_nonhisp_white_persons | n_nonhisp_black_persons | n_hispanic_persons | p_white | p_black | p_hispanic | |
|---|---|---|---|---|---|---|---|
| 627 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 238 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 246 | 5199.0 | 40.0 | 22.0 | 5037.0 | 0.007694 | 0.968840 | 0.968840 |
| 243 | 6978.0 | 118.0 | 154.0 | 6517.0 | 0.016910 | 0.933935 | 0.933935 |
| 565 | 4825.0 | 93.0 | 20.0 | 4614.0 | 0.019275 | 0.956269 | 0.956269 |
h['wcdf'] = h.n_nonhisp_white_persons.cumsum() / h.n_nonhisp_white_persons.sum()
h['hcdf'] = h.n_hispanic_persons.cumsum() / h.n_hispanic_persons.sum()
h['tcdf'] = h.n_total_pop.cumsum() / h.n_total_pop.sum()
h['bcdf'] = h.n_nonhisp_black_persons.cumsum() / h.n_nonhisp_black_persons.sum()
h.head()| n_total_pop | n_nonhisp_white_persons | n_nonhisp_black_persons | n_hispanic_persons | p_white | p_black | p_hispanic | wcdf | hcdf | tcdf | bcdf | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 627 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 238 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 246 | 5199.0 | 40.0 | 22.0 | 5037.0 | 0.007694 | 0.968840 | 0.968840 | 0.000026 | 0.004507 | 0.001568 | 0.000141 |
| 243 | 6978.0 | 118.0 | 154.0 | 6517.0 | 0.016910 | 0.933935 | 0.933935 | 0.000105 | 0.010339 | 0.003672 | 0.001128 |
| 565 | 4825.0 | 93.0 | 20.0 | 4614.0 | 0.019275 | 0.956269 | 0.956269 | 0.000166 | 0.014468 | 0.005127 | 0.001256 |
phw50 = h[h.hcdf>=0.5].iloc[0].p_white
pww50 = h[h.wcdf>=0.5].iloc[0].p_whitephw50, pww50(0.27341929470250115, 0.6068664417543164)
import matplotlib.pyplot as plt
# create data
# plot lines
plt.plot(h.p_white, h.hcdf, label = "Hispanic", linestyle='dashed', color='k')
plt.plot(h.p_white, h.wcdf, label = "Non-Hispanic White", linestyle='dotted', color='k')
plt.plot(h.p_white, h.bcdf, label = "Non-Hispanic Black", linestyle='dashdot', color='k')
plt.plot(h.p_white, h.tcdf, label='All', color='k')
plt.xlabel('Percent Non-Hispanic White')
plt.ylabel('cdf')
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.plot([phw50, phw50], [0, .5], linestyle='dashed', color='k')
plt.plot([pww50, pww50], [0, .5], linestyle='dotted', color='k')
plt.plot([0, pww50], [.5, .5], linestyle='solid', color='k')
plt.legend()
plt.show()
Difference in Population Proportions
Research Question
Is there a significant difference between the Hispanic composition of the tract populations between tracts that intersect and do not intersect the highway buffer?
Parameters of Interest: \(p1 - p2\) where \(p1\) is the percentage Hispanic for tracts that do not intersect the buffer, \(p2\) the percentage Hispanic for tracts to do intersect the buffer.
Null Hypothesis: \(p1-p2=0\)
Alternative Hypothesis: \(p1-p2<0\)
estimates = gpd.read_file('estimate.geojson', driver='GeoJSON')estimates.head()| n_total_pop | n_nonhisp_white_persons | n_nonhisp_black_persons | n_hispanic_persons | income | median_household_income | per_capita_income | pci | geometry | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.562176e+05 | 3.215112e+05 | 35412.835692 | 292432.788563 | 2.759467e+10 | 77579.151075 | 37818.115972 | 36490.385567 | MULTIPOLYGON (((475814.243 3628586.622, 475809... |
| 1 | 2.559855e+06 | 1.189245e+06 | 120671.164353 | 825084.214846 | 9.865887e+10 | 66429.266067 | 32423.814728 | 38540.797907 | MULTIPOLYGON (((480756.394 3599073.688, 480752... |
estimates['p_hispanic_persons'] = estimates.n_hispanic_persons / estimates.n_total_popestimates.p_hispanic_persons0 0.386705
1 0.322317
Name: p_hispanic_persons, dtype: float64
Thus \(\hat{p1} = 32.32\)% and \(\hat{p2}=38.67\)%. The percentage Hispanic is higher in the buffer than outside the buffer.
import statsmodels.api as sm
import numpy as np
# https://tirthajyoti.github.io/Intro_Hypothesis_Testing.htmlcount2 = estimates.n_hispanic_persons[0] # inside
nobs2 = estimates.n_total_pop[0]
count1 = estimates.n_hispanic_persons[1]
nobs1 = estimates.n_total_pop[1]
np.random.seed(12345)
size=1000
pop1 = np.random.binomial(1, count1/nobs1, size)
pop2 = np.random.binomial(1, count2/nobs2, size)sm.stats.ttest_ind(pop1, pop2) # pct hispanic outside buffer versus inside buffer(-2.797459637563787, 0.005199988209663126, 1998.0)
The function returns three values: (a) the test statistic, (b) the p-value, (c) the degrees of freedom used in the test.
sm.stats.ttest_ind?Signature: sm.stats.ttest_ind( x1, x2, alternative='two-sided', usevar='pooled', weights=(None, None), value=0, ) Docstring: ttest independent sample Convenience function that uses the classes and throws away the intermediate results, compared to scipy stats: drops axis option, adds alternative, usevar, and weights option. Parameters ---------- x1 : array_like, 1-D or 2-D first of the two independent samples, see notes for 2-D case x2 : array_like, 1-D or 2-D second of the two independent samples, see notes for 2-D case alternative : str The alternative hypothesis, H1, has to be one of the following * 'two-sided' (default): H1: difference in means not equal to value * 'larger' : H1: difference in means larger than value * 'smaller' : H1: difference in means smaller than value usevar : str, 'pooled' or 'unequal' If ``pooled``, then the standard deviation of the samples is assumed to be the same. If ``unequal``, then Welch ttest with Satterthwait degrees of freedom is used weights : tuple of None or ndarrays Case weights for the two samples. For details on weights see ``DescrStatsW`` value : float difference between the means under the Null hypothesis. Returns ------- tstat : float test statistic pvalue : float pvalue of the t-test df : int or float degrees of freedom used in the t-test File: /opt/tljh/user/lib/python3.9/site-packages/statsmodels/stats/weightstats.py Type: function
pop1.sum(), pop2.sum() # a random sample of size 1000 from each of the areas (buffer and outside) with the estimated sample proportions generated 331 Hispanic population outside and 391 inside.(331, 391)
Conclusion of the hypothesis test
Since the p-value is below 0.05, we reject the null hypothesis of similar Hispanic composition for tracts intersecting and not intersecting the buffer in favor of the alternative that the Hispanic composition is lower in tracts that do not intersect the highway buffer.
Testing for Socioeconomic Disparities
Testing if the median household income is different between the population in the buffer and outside the buffer can’t rely on the same approach we used for the proportions.
This is because we need to have some measure of variation within each of these areas that we don’t have in the aggregate estimates.
buffer = gpd.read_file('buffer.geojson', driver='GeoJSON')buffer.plot()<Axes: >

sd.plot()<Axes: >

buffer['buffer'] = 1faces = sd.overlay(buffer, how='union')
faces.reset_index(inplace=True)/opt/tljh/user/lib/python3.9/site-packages/geopandas/geodataframe.py:2451: UserWarning: `keep_geom_type=True` in overlay resulted in 150 dropped geometries of different geometry types than df1 has. Set `keep_geom_type=False` to retain all geometries
return geopandas.overlay(
faces.shape(1121, 161)
faces['buffer']0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
...
1116 NaN
1117 NaN
1118 NaN
1119 NaN
1120 1.0
Name: buffer, Length: 1121, dtype: float64
faces['buffer'] = faces['buffer'].fillna(0)faces['buffer']0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
...
1116 0.0
1117 0.0
1118 0.0
1119 0.0
1120 1.0
Name: buffer, Length: 1121, dtype: float64
sd.shape(628, 159)
faces.plot(column='buffer', categorical=True, legend=True)<Axes: >

faces.groupby(by='buffer').count()| index | geoid | n_mexican_pop | n_cuban_pop | n_puerto_rican_pop | n_russian_pop | n_italian_pop | n_german_pop | n_irish_pop | n_scandaniavian_pop | ... | p_poverty_rate_over_65 | p_poverty_rate_children | p_poverty_rate_white | p_poverty_rate_black | p_poverty_rate_hispanic | p_poverty_rate_native | p_poverty_rate_asian | year | _dvar | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| buffer | |||||||||||||||||||||
| 0.0 | 628 | 628 | 628 | 628 | 628 | 628 | 628 | 628 | 628 | 628 | ... | 628 | 628 | 628 | 628 | 628 | 628 | 628 | 628 | 628 | 628 |
| 1.0 | 493 | 492 | 492 | 492 | 492 | 492 | 492 | 492 | 492 | 492 | ... | 492 | 492 | 492 | 492 | 492 | 492 | 492 | 492 | 492 | 493 |
2 rows × 160 columns
faces.head()| index | geoid | n_mexican_pop | n_cuban_pop | n_puerto_rican_pop | n_russian_pop | n_italian_pop | n_german_pop | n_irish_pop | n_scandaniavian_pop | ... | p_poverty_rate_children | p_poverty_rate_white | p_poverty_rate_black | p_poverty_rate_hispanic | p_poverty_rate_native | p_poverty_rate_asian | year | _dvar | buffer | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 06073000100 | 278.0 | 19.0 | 9.0 | 42.0 | 125.0 | 86.0 | 65.0 | 11.0 | ... | 0.000000 | 1.616554 | 0.00000 | 1.260912 | 0.000000 | 0.0 | 2019.0 | 1.0 | 1.0 | MULTIPOLYGON (((482146.587 3624430.270, 482144... |
| 1 | 1 | 06073000201 | 95.0 | 11.0 | 0.0 | 23.0 | 44.0 | 32.0 | 64.0 | 0.0 | ... | 0.581703 | 10.470650 | 0.00000 | 1.322052 | 0.581703 | 0.0 | 2019.0 | 1.0 | 1.0 | POLYGON ((483329.845 3624434.530, 483414.898 3... |
| 2 | 2 | 06073000202 | 372.0 | 36.0 | 36.0 | 63.0 | 148.0 | 160.0 | 37.0 | 0.0 | ... | 0.468645 | 3.347467 | 0.00000 | 0.848025 | 0.000000 | 0.0 | 2019.0 | 1.0 | 1.0 | POLYGON ((482778.615 3623152.865, 482785.384 3... |
| 3 | 3 | 06073000300 | 757.0 | 0.0 | 43.0 | 7.0 | 87.0 | 209.0 | 192.0 | 0.0 | ... | 0.217477 | 11.111111 | 0.90945 | 2.412021 | 0.000000 | 0.0 | 2019.0 | 1.0 | 1.0 | POLYGON ((485076.654 3623400.232, 485141.579 3... |
| 4 | 4 | 06073000400 | 701.0 | 0.0 | 11.0 | 29.0 | 66.0 | 61.0 | 16.0 | 0.0 | ... | 0.000000 | 4.068462 | 1.37486 | 1.206510 | 0.000000 | 0.0 | 2019.0 | 1.0 | 1.0 | POLYGON ((484058.709 3624558.267, 484249.019 3... |
5 rows × 161 columns
Now we have faces classified as within and outside of the buffer.
faces[['median_household_income', 'buffer']].groupby(by='buffer').mean()| median_household_income | |
|---|---|
| buffer | |
| 0.0 | 82819.480892 |
| 1.0 | 82525.831301 |
faces.explore(column='median_household_income', legend=True, tooltip=['buffer', 'median_household_income'])